Read the data

# Load the raw training set with text columns kept as character vectors and
# discard the first column (presumably the row Id, as in test.csv -- confirm).
train <- read.csv("train.csv", stringsAsFactors = FALSE)[, -1]

Delete the variables with too many NAs and impute the data. 1. BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1 and BsmtFinType2 have almost the same number of missing values. It can be inferred that the missing data in these variables share one cause, most likely the absence of a basement. 2. MasVnrType and MasVnrArea are always NA together. This is probably because the house has no masonry veneer, which is recorded as NA.

# Count the NA entries in every column of a data frame (named integer vector).
count_missing <- function(df) {
  vapply(df, function(column) sum(is.na(column)), integer(1))
}

dimension <- dim(train)
missingNum <- count_missing(train)

# Keep only the columns whose NA count is below 5% of the number of rows.
data <- train[, missingNum < 0.05 * dimension[1]]

# For the basement descriptors an NA is recoded as the explicit category
# "None" instead of being imputed.
basement_cols <- c("BsmtExposure", "BsmtFinType1", "BsmtFinType2",
                   "BsmtQual", "BsmtCond")
data[basement_cols] <- lapply(data[basement_cols], function(column) {
  column[is.na(column)] <- "None"
  column
})

# Remaining NA counts per column after the basement fill.
missingNum <- count_missing(data)

After filling in the basement-related missing values, the remaining missing values are imputed with MICE. The categorical variables "MasVnrType" and "Electrical" should be converted into factors before the data are imputed.

library(mice)
# Rows without a recorded veneer area are treated as having no masonry veneer.
# The mask is computed once, before MasVnrArea itself is overwritten.
no_veneer <- is.na(data$MasVnrArea)
data$MasVnrType[no_veneer] <- "None"
data$MasVnrArea[no_veneer] <- 0

# Categorical columns that may still hold NAs are converted to factors so
# that mice can impute them.
data$MasVnrType <- as.factor(data$MasVnrType)
data$Electrical <- as.factor(data$Electrical)

# Single imputation (m = 1). The seed is fixed so that re-running the script
# produces the same imputed values.
data_complete <- mice(data, m = 1, seed = 12345, printFlag = FALSE)
data_complete <- complete(data_complete)

Make sure there are no missing values inside.

# Total number of NA cells left in the imputed training data (expected: 0).
TotalMissingSum <- sum(is.na(data_complete))

Add the new features: 1. Finished basement square feet (type 1 and type 2) 2. 1st and 2nd floor square feet 3. Wood deck and open porch square feet 4. Basement bathrooms 5. Bathrooms 6. Age of the house (YrSold - YearBuilt) 7. Years since the last remodel (YrSold - YearRemodAdd) 8. High-quality square feet 9. Total area

# Engineered features for the training set. New columns are appended one at a
# time so that their order in the data frame stays fixed.

# Finished basement area (type 1 + type 2).
data_complete$BasementSF <- with(data_complete, BsmtFinSF1 + BsmtFinSF2)
# First plus second floor area.
data_complete$OneandTwoFloorSF <- with(data_complete, X1stFlrSF + X2ndFlrSF)
# Wood deck plus open porch area.
data_complete$FrontSF <- with(data_complete, WoodDeckSF + OpenPorchSF)
# Bathroom counts, a half bath counting as 0.5.
data_complete$BasementBath <- with(data_complete,
                                   BsmtFullBath + 0.5 * BsmtHalfBath)
data_complete$Bath <- with(data_complete, FullBath + 0.5 * HalfBath)
# Years between construction / last remodel and the sale.
data_complete$Age <- with(data_complete, YrSold - YearBuilt)
data_complete$YrOfRemodel <- with(data_complete, YrSold - YearRemodAdd)
data_complete$HighQualSF <- with(
  data_complete,
  BsmtFinSF1 + BsmtFinSF2 + GrLivArea + GarageArea + WoodDeckSF + OpenPorchSF
)
data_complete$TotalArea <- with(
  data_complete,
  GrLivArea + TotalBsmtSF + GarageArea + LotArea + MasVnrArea + OpenPorchSF +
    PoolArea + ScreenPorch + WoodDeckSF + X3SsnPorch + EnclosedPorch
)

# The model target is the log of the sale price.
data_complete$LogPrice <- log(data_complete$SalePrice)
# Drop the raw target by name rather than by position, so the right column is
# removed even if the set of retained columns changes.
data_complete$SalePrice <- NULL

Save the imputed data for future use

# Persist the imputed, feature-engineered training data; row names are omitted
# so the file contains only the data columns.
write.csv(data_complete, file = "data_complete.csv", row.names = FALSE)

Preprocess and impute the test dataset. 1. Exterior1st and Exterior2nd are NA together, so the missing data are taken to mean that the house has no exterior covering. 2. MasVnrType and MasVnrArea are, most of the time, NA together. This is probably because the house has no masonry veneer.

# Load the test set, remember the Id column for the submission file, then
# remove it from the predictors.
test <- read.csv("test.csv", stringsAsFactors = FALSE)
id <- test$Id
test <- test[, setdiff(colnames(test), "Id"), drop = FALSE]

# Keep exactly the predictors that were retained for the training data.
# Selecting by name (in test-set column order) avoids hard-coded column
# positions; columns that exist only in the training data are ignored.
all_names <- colnames(data)
testdata <- test[, intersect(colnames(test), all_names), drop = FALSE]

# Print the number of missing values per retained column.
vapply(testdata, function(x) sum(is.na(x)), integer(1))
##    MSSubClass      MSZoning       LotArea        Street      LotShape 
##             0             4             0             0             0 
##   LandContour     Utilities     LotConfig     LandSlope  Neighborhood 
##             0             2             0             0             0 
##    Condition1    Condition2      BldgType    HouseStyle   OverallQual 
##             0             0             0             0             0 
##   OverallCond     YearBuilt  YearRemodAdd     RoofStyle      RoofMatl 
##             0             0             0             0             0 
##   Exterior1st   Exterior2nd    MasVnrType    MasVnrArea     ExterQual 
##             1             1            16            15             0 
##     ExterCond    Foundation      BsmtQual      BsmtCond  BsmtExposure 
##             0             0            44            45            44 
##  BsmtFinType1    BsmtFinSF1  BsmtFinType2    BsmtFinSF2     BsmtUnfSF 
##            42             1            42             1             1 
##   TotalBsmtSF       Heating     HeatingQC    CentralAir    Electrical 
##             1             0             0             0             0 
##     X1stFlrSF     X2ndFlrSF  LowQualFinSF     GrLivArea  BsmtFullBath 
##             0             0             0             0             2 
##  BsmtHalfBath      FullBath      HalfBath  BedroomAbvGr  KitchenAbvGr 
##             2             0             0             0             0 
##   KitchenQual  TotRmsAbvGrd    Functional    Fireplaces    GarageCars 
##             1             0             2             0             1 
##    GarageArea    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch 
##             1             0             0             0             0 
##    X3SsnPorch   ScreenPorch      PoolArea       MiscVal        MoSold 
##             0             0             0             0             0 
##        YrSold      SaleType SaleCondition 
##             0             1             0
# NOTE(review): the row numbers below are specific to this test.csv; they are
# presumably houses that have a basement but lack a single basement field.
# Use "No" (no exposure), a level that also occurs in the training data, so
# the value does not become a test-only category with no fitted coefficient.
testdata[c(28, 889), "BsmtExposure"] <- "No"
testdata[c(758, 759), "BsmtQual"] <- "None"

# Every row still lacking BsmtExposure is treated as having no basement.
# The mask is taken once, before BsmtExposure itself is filled.
no_bsmt <- is.na(testdata$BsmtExposure)
testdata$BsmtFinType1[no_bsmt] <- "None"
testdata$BsmtFinType2[no_bsmt] <- "None"
testdata$BsmtQual[no_bsmt] <- "None"
testdata$BsmtCond[no_bsmt] <- "None"
testdata$BsmtFinSF1[no_bsmt] <- 0
testdata$BsmtFinSF2[no_bsmt] <- 0
testdata$BsmtUnfSF[no_bsmt] <- 0
# The remaining basement measures are zeroed too, but only where they are
# missing, instead of leaving them to the imputation model.
bsmt_measures <- c("TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath")
testdata[bsmt_measures] <- lapply(testdata[bsmt_measures], function(v) {
  v[no_bsmt & is.na(v)] <- 0
  v
})
testdata$BsmtExposure[no_bsmt] <- "None"

# Missing exterior covering / masonry veneer are recoded as "None".
testdata$Exterior1st[is.na(testdata$Exterior1st)] <- "None"
testdata$Exterior2nd[is.na(testdata$Exterior2nd)] <- "None"
no_veneer <- is.na(testdata$MasVnrArea)
testdata$MasVnrType[no_veneer] <- "None"
testdata$MasVnrArea[no_veneer] <- 0

# Missing Utilities are set to "AllPub".
testdata$Utilities[is.na(testdata$Utilities)] <- "AllPub"

# Categorical columns converted to factors so that mice can impute them.
factor_cols <- c("MSZoning", "MasVnrType", "KitchenQual", "Functional",
                 "SaleType", "BsmtCond")
testdata[factor_cols] <- lapply(testdata[factor_cols], as.factor)

# Single CART-based imputation; the seed is fixed for reproducibility.
test_complete <- mice(testdata, m = 1, method = "cart", seed = 12345,
                      printFlag = FALSE)
test_complete <- complete(test_complete)

# Total number of NA cells left after imputation (expected: 0).
TotalMissingNum <- sum(is.na(test_complete))

Add New Features into the test dataset

# Engineered features for the test set, built with the same formulas as for
# the training set. Columns are appended one by one to keep their order.
test_complete$BasementSF <- with(test_complete, BsmtFinSF1 + BsmtFinSF2)
test_complete$OneandTwoFloorSF <- with(test_complete, X1stFlrSF + X2ndFlrSF)
test_complete$FrontSF <- with(test_complete, WoodDeckSF + OpenPorchSF)
# A half bath counts as 0.5 of a bathroom.
test_complete$BasementBath <- with(test_complete,
                                   BsmtFullBath + 0.5 * BsmtHalfBath)
test_complete$Bath <- with(test_complete, FullBath + 0.5 * HalfBath)
# Years between construction / last remodel and the sale.
test_complete$Age <- with(test_complete, YrSold - YearBuilt)
test_complete$YrOfRemodel <- with(test_complete, YrSold - YearRemodAdd)
test_complete$HighQualSF <- with(
  test_complete,
  BsmtFinSF1 + BsmtFinSF2 + GrLivArea + GarageArea + WoodDeckSF + OpenPorchSF
)
test_complete$TotalArea <- with(
  test_complete,
  GrLivArea + TotalBsmtSF + GarageArea + LotArea + MasVnrArea + OpenPorchSF +
    PoolArea + ScreenPorch + WoodDeckSF + X3SsnPorch + EnclosedPorch
)
# Persist the processed test set without row names.
write.csv(test_complete, file = 'test_complete.csv', row.names = FALSE)

Read the imputed data

# Reload the processed train and test sets written earlier.
data_complete <- read.csv("data_complete.csv", header = TRUE)
test_complete <- read.csv("test_complete.csv", header = TRUE)

# Stack train (without the LogPrice target, dropped by name) on top of test so
# that both are dummy-encoded against the same set of factor levels.
predictor_cols <- setdiff(colnames(data_complete), "LogPrice")
complete <- rbind(data_complete[, predictor_cols], test_complete)
all_data <- model.matrix(~., complete)

# model.matrix drops rows containing NA; fail loudly rather than mis-align the
# train/test split below.
stopifnot(nrow(all_data) == nrow(complete))

# Split the encoded matrix back into its train and test parts by row count.
n_train <- nrow(data_complete)
train_data <- all_data[seq_len(n_train), ]
test_data <- all_data[seq(n_train + 1, nrow(all_data)), ]

Split the data into train and test sets and fit regularised linear regression models. (A commonly used train/test ratio is about 80/20, also referred to as the Pareto principle; here the train set is 80% of the original data and the test set is the remaining 20%.)

library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-10
# Predictors (encoded design matrix) and response (log sale price).
ind <- train_data
dep <- data_complete$LogPrice

# Reproducible 80/20 split of the training rows into fit and hold-out parts.
set.seed(12345)
train.ind <- sample(seq_len(dimension[1]), dimension[1] * 0.8)
train_ind <- ind[train.ind, ]
test_ind <- ind[-train.ind, ]
train_dep <- dep[train.ind]
test_dep <- dep[-train.ind]

# Regularisation paths for the three penalty mixes: alpha = 1 (lasso),
# alpha = 0 (ridge) and alpha = 0.5 (elastic net), each plotted against lambda.
fit.lasso <- glmnet(x = train_ind, y = train_dep, alpha = 1)
fit.ridge <- glmnet(x = train_ind, y = train_dep, alpha = 0)
fit.elnet <- glmnet(x = train_ind, y = train_dep, alpha = 0.5)

plot(fit.lasso, xvar = "lambda", main = "LASSO")

plot(fit.ridge, xvar = "lambda", main = "RIDGE")

plot(fit.elnet, xvar = "lambda", main = "ELASTIC NET")

Find the best model with cross-validation: 1. Train 11 models on the training dataset, each with a different alpha (a different mix of LASSO and RIDGE, from 0 to 1 in steps of 0.1). 2. Predict the held-out test set and compare the mean squared errors. 3. Compare and find the best alpha value.

# Cross-validated gaussian glmnet on the fit part of the split for one alpha.
cv_fit_for_alpha <- function(a) {
  cv.glmnet(x = train_ind, y = train_dep, type.measure = "mse", alpha = a,
            family = "gaussian")
}

# Hold-out predictions of a cv.glmnet fit at its lambda.1se.
holdout_prediction <- function(fit) {
  predict(fit, s = fit$lambda.1se, newx = test_ind)
}

# Mean squared error of hold-out predictions against the hold-out response.
holdout_mse <- function(pred) {
  mean((test_dep - pred)^2)
}

# One cross-validated fit per alpha in 0, 0.1, ..., 1. The calls are kept in
# ascending alpha order because each one draws random CV folds.
fit0 <- cv_fit_for_alpha(0.0)
fit1 <- cv_fit_for_alpha(0.1)
fit2 <- cv_fit_for_alpha(0.2)
fit3 <- cv_fit_for_alpha(0.3)
fit4 <- cv_fit_for_alpha(0.4)
fit5 <- cv_fit_for_alpha(0.5)
fit6 <- cv_fit_for_alpha(0.6)
fit7 <- cv_fit_for_alpha(0.7)
fit8 <- cv_fit_for_alpha(0.8)
fit9 <- cv_fit_for_alpha(0.9)
fit10 <- cv_fit_for_alpha(1.0)

pred0 <- holdout_prediction(fit0)
pred1 <- holdout_prediction(fit1)
pred2 <- holdout_prediction(fit2)
pred3 <- holdout_prediction(fit3)
pred4 <- holdout_prediction(fit4)
pred5 <- holdout_prediction(fit5)
pred6 <- holdout_prediction(fit6)
pred7 <- holdout_prediction(fit7)
pred8 <- holdout_prediction(fit8)
pred9 <- holdout_prediction(fit9)
pred10 <- holdout_prediction(fit10)

mse0 <- holdout_mse(pred0)
mse1 <- holdout_mse(pred1)
mse2 <- holdout_mse(pred2)
mse3 <- holdout_mse(pred3)
mse4 <- holdout_mse(pred4)
mse5 <- holdout_mse(pred5)
mse6 <- holdout_mse(pred6)
mse7 <- holdout_mse(pred7)
mse8 <- holdout_mse(pred8)
mse9 <- holdout_mse(pred9)
mse10 <- holdout_mse(pred10)

Plot Alpha vs MSE

# Hold-out MSE as a function of the elastic-net mixing parameter, with each
# point labelled by its MSE rounded to five decimals.
alpha <- seq(from = 0, to = 1, by = 0.1)
mse <- c(mse0, mse1, mse2, mse3, mse4, mse5, mse6, mse7, mse8, mse9, mse10)
plot(x = alpha, y = mse, type = "l", xlab = "alpha", ylab = "MSE")
text(x = alpha, y = mse, labels = round(mse, digits = 5),
     cex = 0.6, pos = 4, col = "red")

Choose the best alpha (here alpha = 0.1) for the SalePrice prediction. The resulting score is 0.16360, which ranks 1654th.

# Predict log prices for the real test set with the alpha = 0.1 model at its
# lambda.1se, convert back to the price scale and write the submission file.
prediction <- predict(fit1, newx = test_data, s = fit1$lambda.1se)
df <- setNames(data.frame(cbind(id, exp(prediction))), c('Id', 'SalePrice'))
write.csv(df, file = "result.csv", row.names = FALSE)

Using all variables for linear regression. The reason for the NA coefficients of some engineered features is that these features are generated from other features by simple addition and subtraction, i.e. EngineeredFeature = Feature1 + Feature2 + Feature3 + … More generally, assume NewFeature = coef1 * Feature1 + coef2 * Feature2 + coef3 * Feature3, and that the linear regression equation for the model is Response = a * Feature1 + b * Feature2 + c * Feature3 + d * Feature4 + e * NewFeature. Substituting NewFeature, the equation can be rewritten as Response = (a + e * coef1) * Feature1 + (b + e * coef2) * Feature2 + (c + e * coef3) * Feature3 + d * Feature4. From this perspective, the newly generated features are exact linear combinations of existing ones and therefore do not help to refine the linear model.

# Reload the processed data. The test set gets a placeholder LogPrice of 0 so
# that it has the same columns as the training set and can be stacked with it.
data_complete <- read.csv("data_complete.csv", header = TRUE)
test_complete <- read.csv("test_complete.csv", header = TRUE)
test_complete$LogPrice <- 0
complete <- rbind(data_complete, test_complete)

# Dummy-encode train and test together so both share the same columns.
all_data <- model.matrix(~., complete)
# model.matrix drops rows containing NA; fail loudly rather than mis-align the
# train/test split below.
stopifnot(nrow(all_data) == nrow(complete))

# Split back into train and test parts by row count.
n_train <- nrow(data_complete)
train_data <- all_data[seq_len(n_train), ]
test_data <- all_data[seq(n_train + 1, nrow(all_data)), ]
train_feed1 <- data.frame(train_data)
test_feed1 <- data.frame(test_data)

# Ordinary least squares on every encoded column. The response is named inside
# the formula so that it is resolved through `data`; `.` then expands to all
# remaining columns.
lmModel1 <- lm(LogPrice ~ ., data = train_feed1)
summary(lmModel1)
## 
## Call:
## lm(formula = train_feed1$LogPrice ~ ., data = train_feed1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69632 -0.04735  0.00349  0.05295  0.69632 
## 
## Coefficients: (18 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           7.093e+00  4.777e+00   1.485 0.137818    
## X.Intercept.                 NA         NA      NA       NA    
## MSSubClass           -3.991e-04  3.781e-04  -1.056 0.291309    
## MSZoningFV            4.405e-01  5.450e-02   8.083 1.48e-15 ***
## MSZoningRH            4.111e-01  5.419e-02   7.587 6.39e-14 ***
## MSZoningRL            4.131e-01  4.644e-02   8.895  < 2e-16 ***
## MSZoningRM            3.720e-01  4.347e-02   8.556  < 2e-16 ***
## LotArea               2.976e-06  4.936e-07   6.029 2.17e-09 ***
## StreetPave            1.166e-01  5.447e-02   2.141 0.032443 *  
## LotShapeIR2           2.986e-02  1.914e-02   1.560 0.119063    
## LotShapeIR3           1.906e-02  4.002e-02   0.476 0.633945    
## LotShapeReg           4.951e-03  7.358e-03   0.673 0.501111    
## LandContourHLS        3.072e-02  2.351e-02   1.306 0.191685    
## LandContourLow       -2.344e-02  2.926e-02  -0.801 0.423297    
## LandContourLvl        2.798e-02  1.685e-02   1.661 0.096996 .  
## UtilitiesNoSeWa      -2.187e-01  1.184e-01  -1.847 0.065016 .  
## LotConfigCulDSac      2.648e-02  1.462e-02   1.812 0.070272 .  
## LotConfigFR2         -3.908e-02  1.831e-02  -2.135 0.032983 *  
## LotConfigFR3         -9.870e-02  5.768e-02  -1.711 0.087294 .  
## LotConfigInside      -1.355e-02  7.987e-03  -1.696 0.090157 .  
## LandSlopeMod          3.107e-02  1.815e-02   1.712 0.087101 .  
## LandSlopeSev         -1.952e-01  5.200e-02  -3.754 0.000182 ***
## NeighborhoodBlueste  -5.018e-02  8.715e-02  -0.576 0.564832    
## NeighborhoodBrDale   -6.606e-02  5.006e-02  -1.319 0.187273    
## NeighborhoodBrkSide   1.049e-02  4.282e-02   0.245 0.806585    
## NeighborhoodClearCr   1.903e-02  4.209e-02   0.452 0.651175    
## NeighborhoodCollgCr  -2.086e-02  3.278e-02  -0.636 0.524608    
## NeighborhoodCrawfor   1.073e-01  3.884e-02   2.762 0.005830 ** 
## NeighborhoodEdwards  -8.568e-02  3.623e-02  -2.365 0.018202 *  
## NeighborhoodGilbert  -1.143e-02  3.511e-02  -0.325 0.744881    
## NeighborhoodIDOTRR   -3.006e-02  4.864e-02  -0.618 0.536739    
## NeighborhoodMeadowV  -1.672e-01  5.096e-02  -3.281 0.001063 ** 
## NeighborhoodMitchel  -6.140e-02  3.709e-02  -1.655 0.098081 .  
## NeighborhoodNAmes    -3.810e-02  3.550e-02  -1.073 0.283432    
## NeighborhoodNoRidge   3.368e-02  3.834e-02   0.878 0.379878    
## NeighborhoodNPkVill  -2.206e-03  6.376e-02  -0.035 0.972400    
## NeighborhoodNridgHt   7.584e-02  3.368e-02   2.252 0.024519 *  
## NeighborhoodNWAmes   -4.252e-02  3.644e-02  -1.167 0.243574    
## NeighborhoodOldTown  -5.663e-02  4.361e-02  -1.299 0.194326    
## NeighborhoodSawyer   -2.636e-02  3.699e-02  -0.713 0.476212    
## NeighborhoodSawyerW  -8.628e-03  3.535e-02  -0.244 0.807196    
## NeighborhoodSomerst   1.839e-02  4.094e-02   0.449 0.653413    
## NeighborhoodStoneBr   1.305e-01  3.779e-02   3.453 0.000573 ***
## NeighborhoodSWISU    -9.942e-04  4.377e-02  -0.023 0.981882    
## NeighborhoodTimber    1.793e-04  3.705e-02   0.005 0.996141    
## NeighborhoodVeenker   4.206e-02  4.787e-02   0.879 0.379788    
## Condition1Feedr       2.265e-02  2.246e-02   1.008 0.313482    
## Condition1Norm        7.374e-02  1.858e-02   3.970 7.61e-05 ***
## Condition1PosA        3.782e-02  4.536e-02   0.834 0.404534    
## Condition1PosN        7.678e-02  3.365e-02   2.282 0.022665 *  
## Condition1RRAe       -5.077e-02  4.116e-02  -1.233 0.217720    
## Condition1RRAn        3.012e-02  3.097e-02   0.972 0.331095    
## Condition1RRNe        5.254e-03  8.061e-02   0.065 0.948043    
## Condition1RRNn        8.366e-02  5.811e-02   1.440 0.150198    
## Condition2Feedr       1.206e-01  1.014e-01   1.189 0.234490    
## Condition2Norm        5.611e-02  8.660e-02   0.648 0.517121    
## Condition2PosA        2.335e-01  1.673e-01   1.396 0.163096    
## Condition2PosN       -8.105e-01  1.219e-01  -6.648 4.45e-11 ***
## Condition2RRAe       -5.341e-01  2.079e-01  -2.569 0.010306 *  
## Condition2RRAn       -2.762e-02  1.402e-01  -0.197 0.843789    
## Condition2RRNn        2.919e-02  1.198e-01   0.244 0.807569    
## BldgType2fmCon        4.890e-02  5.698e-02   0.858 0.390946    
## BldgTypeDuplex       -1.098e-02  3.314e-02  -0.331 0.740528    
## BldgTypeTwnhs        -5.434e-02  4.522e-02  -1.202 0.229721    
## BldgTypeTwnhsE       -7.864e-03  4.077e-02  -0.193 0.847087    
## HouseStyle1.5Unf      3.944e-03  3.487e-02   0.113 0.909969    
## HouseStyle1Story     -3.268e-02  1.919e-02  -1.703 0.088828 .  
## HouseStyle2.5Fin     -5.205e-02  5.416e-02  -0.961 0.336692    
## HouseStyle2.5Unf      5.967e-02  4.129e-02   1.445 0.148684    
## HouseStyle2Story     -1.658e-02  1.573e-02  -1.054 0.292068    
## HouseStyleSFoyer     -1.737e-02  2.829e-02  -0.614 0.539422    
## HouseStyleSLvl       -2.526e-03  2.483e-02  -0.102 0.918991    
## OverallQual           4.411e-02  4.585e-03   9.621  < 2e-16 ***
## OverallCond           3.771e-02  3.958e-03   9.527  < 2e-16 ***
## YearBuilt             1.699e-03  3.370e-04   5.042 5.30e-07 ***
## YearRemodAdd          7.517e-04  2.491e-04   3.018 0.002596 ** 
## RoofStyleGable       -1.365e-02  8.354e-02  -0.163 0.870189    
## RoofStyleGambrel      2.194e-03  9.123e-02   0.024 0.980820    
## RoofStyleHip         -9.934e-03  8.378e-02  -0.119 0.905640    
## RoofStyleMansard      5.550e-02  9.727e-02   0.571 0.568410    
## RoofStyleShed         4.734e-01  1.582e-01   2.993 0.002818 ** 
## RoofMatlCompShg       2.603e+00  1.497e-01  17.392  < 2e-16 ***
## RoofMatlMembran       2.999e+00  2.177e-01  13.775  < 2e-16 ***
## RoofMatlMetal         2.860e+00  2.134e-01  13.404  < 2e-16 ***
## RoofMatlRoll          2.624e+00  1.882e-01  13.939  < 2e-16 ***
## RoofMatlTar.Grv       2.635e+00  1.725e-01  15.279  < 2e-16 ***
## RoofMatlWdShake       2.531e+00  1.656e-01  15.278  < 2e-16 ***
## RoofMatlWdShngl       2.701e+00  1.549e-01  17.437  < 2e-16 ***
## Exterior1stAsphShn    2.247e-02  1.513e-01   0.148 0.881992    
## Exterior1stBrkComm   -1.788e-01  1.258e-01  -1.421 0.155470    
## Exterior1stBrkFace    1.120e-01  5.672e-02   1.975 0.048435 *  
## Exterior1stCBlock    -4.665e-02  1.235e-01  -0.378 0.705790    
## Exterior1stCemntBd   -6.269e-02  8.565e-02  -0.732 0.464356    
## Exterior1stHdBoard    2.085e-02  5.731e-02   0.364 0.716004    
## Exterior1stImStucc    7.525e-03  1.257e-01   0.060 0.952275    
## Exterior1stMetalSd    6.372e-02  6.533e-02   0.975 0.329545    
## Exterior1stPlywood    2.038e-02  5.665e-02   0.360 0.719148    
## Exterior1stStone      8.705e-02  1.100e-01   0.791 0.428930    
## Exterior1stStucco     5.388e-02  6.219e-02   0.866 0.386495    
## Exterior1stVinylSd    2.468e-02  5.924e-02   0.417 0.677083    
## Exterior1stWd.Sdng   -9.551e-03  5.465e-02  -0.175 0.861293    
## Exterior1stWdShing    2.510e-02  5.908e-02   0.425 0.671051    
## Exterior1stNone              NA         NA      NA       NA    
## Exterior2ndAsphShn    1.185e-03  1.005e-01   0.012 0.990589    
## Exterior2ndBrk.Cmn    1.002e-02  9.114e-02   0.110 0.912456    
## Exterior2ndBrkFace   -5.371e-02  5.898e-02  -0.911 0.362636    
## Exterior2ndCBlock            NA         NA      NA       NA    
## Exterior2ndCmentBd    1.109e-01  8.436e-02   1.315 0.188681    
## Exterior2ndHdBoard   -7.326e-03  5.533e-02  -0.132 0.894685    
## Exterior2ndImStucc    1.156e-02  6.376e-02   0.181 0.856123    
## Exterior2ndMetalSd   -2.099e-02  6.380e-02  -0.329 0.742201    
## Exterior2ndOther     -1.047e-01  1.246e-01  -0.841 0.400602    
## Exterior2ndPlywood   -3.483e-03  5.363e-02  -0.065 0.948233    
## Exterior2ndStone     -8.308e-02  7.703e-02  -1.079 0.281011    
## Exterior2ndStucco    -1.526e-02  6.004e-02  -0.254 0.799334    
## Exterior2ndVinylSd    1.221e-02  5.724e-02   0.213 0.831077    
## Exterior2ndWd.Sdng    3.178e-02  5.295e-02   0.600 0.548494    
## Exterior2ndWd.Shng   -1.393e-02  5.489e-02  -0.254 0.799711    
## Exterior2ndNone              NA         NA      NA       NA    
## MasVnrTypeBrkFace     3.372e-02  3.102e-02   1.087 0.277264    
## MasVnrTypeNone        2.674e-02  3.128e-02   0.855 0.392765    
## MasVnrTypeStone       4.229e-02  3.280e-02   1.289 0.197584    
## MasVnrArea            1.032e-05  2.636e-05   0.392 0.695473    
## ExterQualFa           2.861e-02  4.965e-02   0.576 0.564543    
## ExterQualGd           1.037e-02  2.195e-02   0.473 0.636576    
## ExterQualTA           1.554e-02  2.427e-02   0.640 0.522223    
## ExterCondFa          -9.787e-02  8.279e-02  -1.182 0.237418    
## ExterCondGd          -6.816e-02  7.912e-02  -0.862 0.389087    
## ExterCondPo          -1.026e-01  1.441e-01  -0.712 0.476602    
## ExterCondTA          -5.020e-02  7.894e-02  -0.636 0.524961    
## FoundationCBlock      2.411e-02  1.436e-02   1.680 0.093280 .  
## FoundationPConc       4.026e-02  1.549e-02   2.599 0.009453 ** 
## FoundationSlab       -3.188e-02  4.567e-02  -0.698 0.485320    
## FoundationStone       1.292e-01  4.918e-02   2.627 0.008731 ** 
## FoundationWood       -1.184e-01  6.658e-02  -1.778 0.075613 .  
## BsmtQualFa           -3.438e-02  2.884e-02  -1.192 0.233428    
## BsmtQualGd           -2.744e-02  1.508e-02  -1.820 0.068924 .  
## BsmtQualNone          1.532e-01  1.688e-01   0.908 0.364259    
## BsmtQualTA           -3.512e-02  1.866e-02  -1.882 0.060021 .  
## BsmtCondGd            1.807e-02  2.394e-02   0.755 0.450447    
## BsmtCondNone                 NA         NA      NA       NA    
## BsmtCondPo            3.218e-01  1.368e-01   2.353 0.018783 *  
## BsmtCondTA            1.918e-02  1.912e-02   1.003 0.315985    
## BsmtExposureGd        2.725e-02  1.375e-02   1.982 0.047744 *  
## BsmtExposureMn       -7.535e-03  1.382e-02  -0.545 0.585733    
## BsmtExposureNo       -1.172e-02  1.003e-02  -1.168 0.242975    
## BsmtExposureNone     -5.027e-02  1.064e-01  -0.472 0.636743    
## BsmtExposureUnf              NA         NA      NA       NA    
## BsmtFinType1BLQ      -4.681e-03  1.259e-02  -0.372 0.710135    
## BsmtFinType1GLQ       1.042e-02  1.152e-02   0.905 0.365842    
## BsmtFinType1LwQ      -2.367e-02  1.703e-02  -1.390 0.164831    
## BsmtFinType1None             NA         NA      NA       NA    
## BsmtFinType1Rec      -6.503e-03  1.361e-02  -0.478 0.632988    
## BsmtFinType1Unf      -1.394e-02  1.333e-02  -1.046 0.295919    
## BsmtFinSF1            1.394e-04  2.397e-05   5.815 7.69e-09 ***
## BsmtFinType2BLQ      -7.004e-02  3.454e-02  -2.028 0.042819 *  
## BsmtFinType2GLQ      -1.893e-03  4.267e-02  -0.044 0.964615    
## BsmtFinType2LwQ      -3.667e-02  3.368e-02  -1.089 0.276432    
## BsmtFinType2None     -1.311e-01  1.155e-01  -1.135 0.256631    
## BsmtFinType2Rec      -2.760e-02  3.248e-02  -0.850 0.395688    
## BsmtFinType2Unf      -1.441e-02  3.454e-02  -0.417 0.676538    
## BsmtFinSF2            1.377e-04  4.147e-05   3.320 0.000926 ***
## BsmtUnfSF             8.112e-05  2.188e-05   3.708 0.000218 ***
## TotalBsmtSF                  NA         NA      NA       NA    
## HeatingGasA           1.574e-01  1.159e-01   1.357 0.174879    
## HeatingGasW           2.215e-01  1.191e-01   1.860 0.063190 .  
## HeatingGrav           8.893e-03  1.256e-01   0.071 0.943550    
## HeatingOthW           1.388e-01  1.430e-01   0.971 0.331808    
## HeatingWall           2.582e-01  1.343e-01   1.922 0.054852 .  
## HeatingQCFa          -2.330e-02  2.136e-02  -1.091 0.275459    
## HeatingQCGd          -2.186e-02  9.435e-03  -2.316 0.020694 *  
## HeatingQCPo          -1.023e-01  1.225e-01  -0.835 0.404048    
## HeatingQCTA          -3.371e-02  9.394e-03  -3.588 0.000346 ***
## CentralAirY           6.738e-02  1.766e-02   3.814 0.000143 ***
## ElectricalFuseF      -2.468e-04  2.644e-02  -0.009 0.992553    
## ElectricalFuseP      -3.970e-02  7.822e-02  -0.508 0.611828    
## ElectricalMix        -1.889e-01  1.832e-01  -1.031 0.302911    
## ElectricalSBrkr      -1.429e-02  1.347e-02  -1.061 0.288977    
## X1stFlrSF             2.355e-04  2.526e-05   9.323  < 2e-16 ***
## X2ndFlrSF             2.113e-04  2.354e-05   8.977  < 2e-16 ***
## LowQualFinSF          1.905e-04  8.238e-05   2.312 0.020917 *  
## GrLivArea                    NA         NA      NA       NA    
## BsmtFullBath          2.286e-02  9.026e-03   2.533 0.011446 *  
## BsmtHalfBath          3.161e-04  1.383e-02   0.023 0.981766    
## FullBath              1.931e-02  9.937e-03   1.943 0.052267 .  
## HalfBath              2.142e-02  9.496e-03   2.255 0.024285 *  
## BedroomAbvGr          5.560e-03  6.177e-03   0.900 0.368266    
## KitchenAbvGr         -3.857e-02  2.558e-02  -1.508 0.131865    
## KitchenQualFa        -5.759e-02  2.825e-02  -2.038 0.041713 *  
## KitchenQualGd        -6.342e-02  1.559e-02  -4.069 5.01e-05 ***
## KitchenQualTA        -6.466e-02  1.767e-02  -3.658 0.000265 ***
## TotRmsAbvGrd          5.013e-03  4.309e-03   1.164 0.244812    
## FunctionalMaj2       -2.313e-01  6.600e-02  -3.504 0.000475 ***
## FunctionalMin1        3.666e-02  3.924e-02   0.934 0.350385    
## FunctionalMin2        3.063e-02  3.916e-02   0.782 0.434246    
## FunctionalMod        -6.043e-02  4.782e-02  -1.264 0.206551    
## FunctionalSev        -2.838e-01  1.265e-01  -2.243 0.025043 *  
## FunctionalTyp         6.751e-02  3.395e-02   1.988 0.046988 *  
## Fireplaces            2.422e-02  6.071e-03   3.989 7.01e-05 ***
## GarageCars            2.377e-02  9.805e-03   2.424 0.015490 *  
## GarageArea            1.246e-04  3.384e-05   3.683 0.000241 ***
## PavedDriveP           1.573e-02  2.460e-02   0.639 0.522639    
## PavedDriveY           2.345e-02  1.546e-02   1.517 0.129567    
## WoodDeckSF            9.299e-05  2.647e-05   3.513 0.000460 ***
## OpenPorchSF           5.752e-05  5.249e-05   1.096 0.273303    
## EnclosedPorch         1.234e-04  5.654e-05   2.183 0.029207 *  
## X3SsnPorch            1.654e-04  1.019e-04   1.623 0.104752    
## ScreenPorch           2.693e-04  5.547e-05   4.855 1.36e-06 ***
## PoolArea              1.613e-04  8.238e-05   1.959 0.050382 .  
## MiscVal               9.047e-08  6.479e-06   0.014 0.988862    
## MoSold               -6.768e-04  1.120e-03  -0.604 0.545660    
## YrSold               -2.262e-03  2.350e-03  -0.963 0.335767    
## SaleTypeCon           8.380e-02  8.086e-02   1.036 0.300236    
## SaleTypeConLD         1.360e-01  4.413e-02   3.082 0.002104 ** 
## SaleTypeConLI        -4.071e-02  5.236e-02  -0.778 0.436942    
## SaleTypeConLw         9.097e-03  5.540e-02   0.164 0.869589    
## SaleTypeCWD           6.371e-02  5.923e-02   1.076 0.282265    
## SaleTypeNew           7.478e-02  7.106e-02   1.052 0.292831    
## SaleTypeOth           6.229e-02  6.623e-02   0.940 0.347146    
## SaleTypeWD           -2.178e-02  1.914e-02  -1.138 0.255431    
## SaleConditionAdjLand  1.073e-01  6.627e-02   1.619 0.105643    
## SaleConditionAlloca   7.345e-02  3.900e-02   1.883 0.059885 .  
## SaleConditionFamily   1.596e-02  2.786e-02   0.573 0.566971    
## SaleConditionNormal   6.782e-02  1.318e-02   5.146 3.09e-07 ***
## SaleConditionPartial  1.788e-02  6.845e-02   0.261 0.793970    
## BasementSF                   NA         NA      NA       NA    
## OneandTwoFloorSF             NA         NA      NA       NA    
## FrontSF                      NA         NA      NA       NA    
## BasementBath                 NA         NA      NA       NA    
## Bath                         NA         NA      NA       NA    
## Age                          NA         NA      NA       NA    
## YrOfRemodel                  NA         NA      NA       NA    
## HighQualSF                   NA         NA      NA       NA    
## TotalArea                    NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1049 on 1244 degrees of freedom
## Multiple R-squared:  0.9412, Adjusted R-squared:  0.9311 
## F-statistic: 92.65 on 215 and 1244 DF,  p-value: < 2.2e-16
# Draw the diagnostic plots produced by plot() for the fitted lm object.
plot(lmModel1)
## Warning: not plotting observations with leverage one:
##   121, 251, 272, 333, 376, 399, 584, 596, 667, 945, 949, 1004, 1012, 1188, 1231, 1271, 1276, 1299, 1322, 1371

## Warning: not plotting observations with leverage one:
##   121, 251, 272, 333, 376, 399, 584, 596, 667, 945, 949, 1004, 1012, 1188, 1231, 1271, 1276, 1299, 1322, 1371

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Keep only the most significant variables and run the linear regression again

# Column positions of the predictors kept for the reduced model; the same
# positions are used for train and test. NOTE(review): position 78 is
# presumably LogPrice in both frames -- confirm if the column set changes.
selected_cols <- c(2, 3, 10, 11, 12, 15, 16, 17, 20, 32, 34, 35, 38, 39, 41,
                   42, 45, 51, 53, 54, 58, 62, 68, 69, 70, 71, 72, 73, 74, 75,
                   76, 77, 78)
train_data <- data_complete[, selected_cols]
test_data <- test_complete[, selected_cols]

# Dummy-encode train and test together so both share the same columns.
complete <- rbind(train_data, test_data)
all_data <- model.matrix(~., complete)
# model.matrix drops rows containing NA; fail loudly rather than mis-align the
# train/test split below.
stopifnot(nrow(all_data) == nrow(complete))

n_train <- nrow(train_data)
train_feed2 <- data.frame(all_data[seq_len(n_train), ])
test_feed2 <- data.frame(all_data[seq(n_train + 1, nrow(all_data)), ])

# Refit on the reduced column set; the response is named inside the formula so
# that it is resolved through `data`.
lmModel2 <- lm(LogPrice ~ ., data = train_feed2)
summary(lmModel2)
## 
## Call:
## lm(formula = train_feed2$LogPrice ~ ., data = train_feed2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.73397 -0.05346  0.00243  0.05756  0.73397 
## 
## Coefficients: (3 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.803e+00  4.657e+00   1.246 0.213024    
## X.Intercept.                 NA         NA      NA       NA    
## MSZoningFV            3.649e-01  5.328e-02   6.849 1.12e-11 ***
## MSZoningRH            3.500e-01  5.327e-02   6.570 7.13e-11 ***
## MSZoningRL            3.736e-01  4.462e-02   8.372  < 2e-16 ***
## MSZoningRM            3.183e-01  4.196e-02   7.585 6.09e-14 ***
## LotArea              -3.304e-05  1.884e-05  -1.754 0.079645 .  
## NeighborhoodBlueste  -5.450e-02  8.630e-02  -0.631 0.527825    
## NeighborhoodBrDale   -1.116e-01  4.483e-02  -2.491 0.012871 *  
## NeighborhoodBrkSide   5.553e-02  3.719e-02   1.493 0.135664    
## NeighborhoodClearCr   5.660e-02  3.825e-02   1.480 0.139234    
## NeighborhoodCollgCr   2.063e-02  2.953e-02   0.699 0.484965    
## NeighborhoodCrawfor   1.479e-01  3.517e-02   4.205 2.78e-05 ***
## NeighborhoodEdwards  -3.385e-02  3.273e-02  -1.034 0.301182    
## NeighborhoodGilbert   4.142e-02  3.133e-02   1.322 0.186295    
## NeighborhoodIDOTRR    2.987e-02  4.270e-02   0.699 0.484414    
## NeighborhoodMeadowV  -1.555e-01  4.403e-02  -3.533 0.000425 ***
## NeighborhoodMitchel  -2.168e-02  3.339e-02  -0.649 0.516330    
## NeighborhoodNAmes     7.881e-03  3.132e-02   0.252 0.801330    
## NeighborhoodNoRidge   5.259e-02  3.492e-02   1.506 0.132284    
## NeighborhoodNPkVill  -3.213e-02  4.819e-02  -0.667 0.505044    
## NeighborhoodNridgHt   8.038e-02  3.147e-02   2.554 0.010751 *  
## NeighborhoodNWAmes   -1.200e-02  3.280e-02  -0.366 0.714628    
## NeighborhoodOldTown  -2.501e-04  3.801e-02  -0.007 0.994750    
## NeighborhoodSawyer    5.545e-03  3.316e-02   0.167 0.867230    
## NeighborhoodSawyerW   5.176e-03  3.246e-02   0.159 0.873326    
## NeighborhoodSomerst   5.698e-02  3.788e-02   1.504 0.132793    
## NeighborhoodStoneBr   1.233e-01  3.632e-02   3.396 0.000703 ***
## NeighborhoodSWISU     2.516e-02  4.021e-02   0.626 0.531597    
## NeighborhoodTimber    3.073e-02  3.410e-02   0.901 0.367688    
## NeighborhoodVeenker   8.350e-02  4.493e-02   1.858 0.063351 .  
## Condition1Feedr       1.771e-02  2.232e-02   0.794 0.427480    
## Condition1Norm        6.647e-02  1.829e-02   3.635 0.000288 ***
## Condition1PosA        5.474e-02  4.558e-02   1.201 0.229936    
## Condition1PosN        7.462e-02  3.352e-02   2.226 0.026166 *  
## Condition1RRAe       -4.987e-02  4.186e-02  -1.191 0.233746    
## Condition1RRAn        3.938e-02  3.092e-02   1.273 0.203107    
## Condition1RRNe        4.677e-02  8.298e-02   0.564 0.573048    
## Condition1RRNn        4.750e-02  5.575e-02   0.852 0.394287    
## Condition2Feedr       1.126e-01  9.697e-02   1.162 0.245598    
## Condition2Norm        7.517e-02  8.249e-02   0.911 0.362371    
## Condition2PosA        2.306e-01  1.418e-01   1.626 0.104119    
## Condition2PosN       -8.847e-01  1.202e-01  -7.361 3.13e-13 ***
## Condition2RRAe       -7.722e-02  1.405e-01  -0.550 0.582635    
## Condition2RRAn       -4.596e-02  1.417e-01  -0.324 0.745706    
## Condition2RRNn        5.794e-02  1.167e-01   0.496 0.619626    
## OverallQual           5.442e-02  4.239e-03  12.839  < 2e-16 ***
## OverallCond           3.705e-02  3.626e-03  10.217  < 2e-16 ***
## YearBuilt             7.967e-04  2.315e-03   0.344 0.730815    
## RoofMatlCompShg       2.791e+00  1.309e-01  21.319  < 2e-16 ***
## RoofMatlMembran       3.056e+00  1.794e-01  17.032  < 2e-16 ***
## RoofMatlMetal         2.875e+00  1.751e-01  16.417  < 2e-16 ***
## RoofMatlRoll          2.760e+00  1.711e-01  16.132  < 2e-16 ***
## RoofMatlTar.Grv       2.813e+00  1.360e-01  20.678  < 2e-16 ***
## RoofMatlWdShake       2.774e+00  1.415e-01  19.600  < 2e-16 ***
## RoofMatlWdShngl       2.859e+00  1.372e-01  20.833  < 2e-16 ***
## BsmtFinSF1           -3.064e-05  2.471e-05  -1.240 0.215116    
## BsmtFinSF2           -6.366e-05  2.992e-05  -2.128 0.033538 *  
## BsmtUnfSF             5.634e-05  2.403e-05   2.344 0.019221 *  
## HeatingQCFa          -3.736e-02  1.922e-02  -1.944 0.052070 .  
## HeatingQCGd          -2.208e-02  9.445e-03  -2.338 0.019546 *  
## HeatingQCPo          -2.249e-03  1.205e-01  -0.019 0.985105    
## HeatingQCTA          -3.527e-02  9.036e-03  -3.903 9.95e-05 ***
## CentralAirY           7.002e-02  1.517e-02   4.617 4.26e-06 ***
## X1stFlrSF             6.519e-05  2.843e-05   2.293 0.021998 *  
## X2ndFlrSF             5.560e-05  2.426e-05   2.292 0.022068 *  
## BsmtFullBath          1.306e-02  2.594e-02   0.504 0.614626    
## KitchenQualFa        -9.053e-02  2.638e-02  -3.432 0.000616 ***
## KitchenQualGd        -6.538e-02  1.435e-02  -4.557 5.66e-06 ***
## KitchenQualTA        -7.231e-02  1.676e-02  -4.315 1.71e-05 ***
## FunctionalMaj2       -1.629e-01  6.011e-02  -2.710 0.006816 ** 
## FunctionalMin1        6.772e-02  3.753e-02   1.804 0.071386 .  
## FunctionalMin2        7.184e-02  3.714e-02   1.934 0.053280 .  
## FunctionalMod        -3.628e-02  4.379e-02  -0.829 0.407524    
## FunctionalSev        -3.334e-01  1.238e-01  -2.693 0.007165 ** 
## FunctionalTyp         1.066e-01  3.180e-02   3.353 0.000822 ***
## Fireplaces            2.656e-02  6.024e-03   4.409 1.12e-05 ***
## WoodDeckSF            9.812e-06  5.691e-05   0.172 0.863128    
## ScreenPorch           2.124e-04  5.870e-05   3.619 0.000307 ***
## SaleConditionAdjLand  1.104e-01  5.972e-02   1.849 0.064718 .  
## SaleConditionAlloca   4.383e-02  3.650e-02   1.201 0.230108    
## SaleConditionFamily   6.564e-03  2.785e-02   0.236 0.813727    
## SaleConditionNormal   7.119e-02  1.221e-02   5.828 6.97e-09 ***
## SaleConditionPartial  1.264e-01  1.706e-02   7.409 2.22e-13 ***
## BasementSF                   NA         NA      NA       NA    
## OneandTwoFloorSF             NA         NA      NA       NA    
## FrontSF              -1.118e-04  5.477e-05  -2.041 0.041423 *  
## BasementBath          9.121e-03  2.717e-02   0.336 0.737129    
## Bath                  1.389e-02  9.405e-03   1.477 0.140023    
## Age                  -1.313e-03  2.304e-03  -0.570 0.568775    
## YrOfRemodel          -6.994e-04  2.383e-04  -2.934 0.003397 ** 
## HighQualSF            1.629e-04  2.733e-05   5.961 3.18e-09 ***
## TotalArea             3.498e-05  1.883e-05   1.857 0.063484 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1114 on 1370 degrees of freedom
## Multiple R-squared:  0.927,  Adjusted R-squared:  0.9223 
## F-statistic: 195.5 on 89 and 1370 DF,  p-value: < 2.2e-16
# Draw the diagnostic plots for lmModel2; they are used to look for
# observations with high leverage, large residuals or high Cook's distance.
plot(lmModel2)
## Warning: not plotting observations with leverage one:
##   326, 584, 667, 1004, 1231, 1276, 1299

## Warning: not plotting observations with leverage one:
##   326, 584, 667, 1004, 1231, 1276, 1299

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

From the four diagnostic plots, it is clear that observations 826 and 524 are very unusual: they combine high leverage, large residuals and a high Cook’s distance, so they can be classified as influential outliers. They have a strong influence on the fitted model, and the results with and without these two observations can be quite different.

# Refit the reduced model after removing the two suspected outlier rows.
rows_to_drop <- c(524, 826)
train_feed3 <- train_feed2[-rows_to_drop, ]
lmModel3 <- lm(formula = train_feed3$LogPrice ~ ., data = train_feed3)
summary(lmModel3)
## 
## Call:
## lm(formula = train_feed3$LogPrice ~ ., data = train_feed3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.69474 -0.05243  0.00152  0.05744  0.50044 
## 
## Coefficients: (4 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.457e+00  4.499e+00   1.213 0.225377    
## X.Intercept.                 NA         NA      NA       NA    
## MSZoningFV            3.684e-01  5.148e-02   7.157 1.34e-12 ***
## MSZoningRH            3.521e-01  5.146e-02   6.841 1.18e-11 ***
## MSZoningRL            3.757e-01  4.311e-02   8.715  < 2e-16 ***
## MSZoningRM            3.189e-01  4.054e-02   7.866 7.37e-15 ***
## LotArea              -3.054e-05  1.820e-05  -1.678 0.093552 .  
## NeighborhoodBlueste  -4.547e-02  8.338e-02  -0.545 0.585586    
## NeighborhoodBrDale   -1.033e-01  4.331e-02  -2.385 0.017197 *  
## NeighborhoodBrkSide   6.657e-02  3.595e-02   1.852 0.064277 .  
## NeighborhoodClearCr   5.260e-02  3.696e-02   1.423 0.154906    
## NeighborhoodCollgCr   1.786e-02  2.853e-02   0.626 0.531355    
## NeighborhoodCrawfor   1.523e-01  3.398e-02   4.482 8.01e-06 ***
## NeighborhoodEdwards  -2.037e-02  3.165e-02  -0.644 0.519912    
## NeighborhoodGilbert   3.909e-02  3.026e-02   1.292 0.196663    
## NeighborhoodIDOTRR    4.216e-02  4.127e-02   1.022 0.307191    
## NeighborhoodMeadowV  -1.491e-01  4.254e-02  -3.505 0.000472 ***
## NeighborhoodMitchel  -2.116e-02  3.226e-02  -0.656 0.512023    
## NeighborhoodNAmes     1.246e-02  3.026e-02   0.412 0.680482    
## NeighborhoodNoRidge   3.949e-02  3.376e-02   1.170 0.242320    
## NeighborhoodNPkVill  -2.662e-02  4.656e-02  -0.572 0.567581    
## NeighborhoodNridgHt   6.311e-02  3.045e-02   2.073 0.038402 *  
## NeighborhoodNWAmes   -1.231e-02  3.169e-02  -0.388 0.697785    
## NeighborhoodOldTown   9.786e-03  3.673e-02   0.266 0.789951    
## NeighborhoodSawyer    1.054e-02  3.204e-02   0.329 0.742364    
## NeighborhoodSawyerW   3.113e-03  3.136e-02   0.099 0.920919    
## NeighborhoodSomerst   5.109e-02  3.660e-02   1.396 0.163020    
## NeighborhoodStoneBr   1.171e-01  3.509e-02   3.337 0.000869 ***
## NeighborhoodSWISU     3.182e-02  3.885e-02   0.819 0.412967    
## NeighborhoodTimber    2.545e-02  3.294e-02   0.772 0.439961    
## NeighborhoodVeenker   8.338e-02  4.341e-02   1.921 0.054953 .  
## Condition1Feedr       1.736e-02  2.156e-02   0.805 0.420809    
## Condition1Norm        6.638e-02  1.767e-02   3.757 0.000179 ***
## Condition1PosA        4.985e-02  4.404e-02   1.132 0.257832    
## Condition1PosN        6.783e-02  3.239e-02   2.094 0.036432 *  
## Condition1RRAe       -5.299e-02  4.044e-02  -1.310 0.190301    
## Condition1RRAn        3.646e-02  2.988e-02   1.220 0.222543    
## Condition1RRNe        4.231e-02  8.016e-02   0.528 0.597681    
## Condition1RRNn        4.601e-02  5.385e-02   0.854 0.393097    
## Condition2Feedr       1.184e-01  9.368e-02   1.264 0.206498    
## Condition2Norm        7.626e-02  7.969e-02   0.957 0.338795    
## Condition2PosA        2.123e-01  1.370e-01   1.549 0.121516    
## Condition2PosN               NA         NA      NA       NA    
## Condition2RRAe       -9.479e-02  1.357e-01  -0.698 0.485044    
## Condition2RRAn       -5.037e-02  1.369e-01  -0.368 0.712959    
## Condition2RRNn        6.629e-02  1.127e-01   0.588 0.556636    
## OverallQual           5.336e-02  4.096e-03  13.027  < 2e-16 ***
## OverallCond           3.723e-02  3.503e-03  10.627  < 2e-16 ***
## YearBuilt             9.061e-04  2.237e-03   0.405 0.685437    
## RoofMatlCompShg       2.897e+00  1.269e-01  22.828  < 2e-16 ***
## RoofMatlMembran       3.182e+00  1.738e-01  18.308  < 2e-16 ***
## RoofMatlMetal         2.990e+00  1.696e-01  17.635  < 2e-16 ***
## RoofMatlRoll          2.860e+00  1.656e-01  17.273  < 2e-16 ***
## RoofMatlTar.Grv       2.914e+00  1.318e-01  22.108  < 2e-16 ***
## RoofMatlWdShake       2.879e+00  1.371e-01  20.993  < 2e-16 ***
## RoofMatlWdShngl       2.957e+00  1.329e-01  22.241  < 2e-16 ***
## BsmtFinSF1           -1.202e-05  2.394e-05  -0.502 0.615698    
## BsmtFinSF2           -4.878e-05  2.894e-05  -1.686 0.092106 .  
## BsmtUnfSF             6.306e-05  2.323e-05   2.715 0.006713 ** 
## HeatingQCFa          -3.575e-02  1.856e-02  -1.926 0.054316 .  
## HeatingQCGd          -2.141e-02  9.125e-03  -2.346 0.019100 *  
## HeatingQCPo           5.038e-03  1.164e-01   0.043 0.965472    
## HeatingQCTA          -3.479e-02  8.730e-03  -3.985 7.10e-05 ***
## CentralAirY           7.081e-02  1.465e-02   4.833 1.50e-06 ***
## X1stFlrSF             9.012e-05  2.758e-05   3.268 0.001112 ** 
## X2ndFlrSF             8.106e-05  2.358e-05   3.438 0.000603 ***
## BsmtFullBath          1.455e-02  2.506e-02   0.581 0.561545    
## KitchenQualFa        -9.101e-02  2.548e-02  -3.572 0.000367 ***
## KitchenQualGd        -6.593e-02  1.386e-02  -4.757 2.17e-06 ***
## KitchenQualTA        -7.314e-02  1.619e-02  -4.517 6.81e-06 ***
## FunctionalMaj2       -1.497e-01  5.808e-02  -2.578 0.010049 *  
## FunctionalMin1        7.607e-02  3.627e-02   2.098 0.036127 *  
## FunctionalMin2        7.843e-02  3.588e-02   2.186 0.028999 *  
## FunctionalMod        -3.640e-02  4.230e-02  -0.860 0.389703    
## FunctionalSev        -3.400e-01  1.196e-01  -2.843 0.004542 ** 
## FunctionalTyp         1.155e-01  3.074e-02   3.757 0.000179 ***
## Fireplaces            2.398e-02  5.825e-03   4.117 4.06e-05 ***
## WoodDeckSF           -3.575e-05  5.516e-05  -0.648 0.517018    
## ScreenPorch           2.073e-04  5.671e-05   3.655 0.000267 ***
## SaleConditionAdjLand  1.059e-01  5.769e-02   1.836 0.066618 .  
## SaleConditionAlloca   4.225e-02  3.526e-02   1.198 0.231036    
## SaleConditionFamily   5.921e-03  2.691e-02   0.220 0.825880    
## SaleConditionNormal   7.242e-02  1.180e-02   6.137 1.10e-09 ***
## SaleConditionPartial  1.281e-01  1.649e-02   7.773 1.50e-14 ***
## BasementSF                   NA         NA      NA       NA    
## OneandTwoFloorSF             NA         NA      NA       NA    
## FrontSF              -5.704e-05  5.320e-05  -1.072 0.283844    
## BasementBath          5.251e-03  2.625e-02   0.200 0.841457    
## Bath                  1.094e-02  9.090e-03   1.203 0.229010    
## Age                  -1.388e-03  2.226e-03  -0.623 0.533183    
## YrOfRemodel          -6.950e-04  2.303e-04  -3.019 0.002586 ** 
## HighQualSF            1.543e-04  2.642e-05   5.841 6.47e-09 ***
## TotalArea             3.262e-05  1.820e-05   1.793 0.073229 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1076 on 1369 degrees of freedom
## Multiple R-squared:  0.9317, Adjusted R-squared:  0.9273 
## F-statistic: 212.3 on 88 and 1369 DF,  p-value: < 2.2e-16
# Draw the diagnostic plots for lmModel3, the model refitted on train_feed3.
plot(lmModel3)
## Warning: not plotting observations with leverage one:
##   272, 326, 583, 666, 1002, 1229, 1274, 1297

## Warning: not plotting observations with leverage one:
##   272, 326, 583, 666, 1002, 1229, 1274, 1297

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

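Compare the coefficients with (lmModel2) and without (lmModel3) the two potential outliers (826 and 524). Note that ChangePercent is reported as a fraction, so 0.80 means 80%. The results show that some coefficients change a lot: several change by more than 60%, and a few (e.g. BsmtFinSF1, HeatingQCPo and WoodDeckSF) change by more than 100%.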

library(qpcR)
## Loading required package: MASS
## Loading required package: minpack.lm
## Loading required package: rgl
## Loading required package: robustbase
# Side-by-side comparison of the coefficients of lmModel2 (all rows) and
# lmModel3 (rows 524 and 826 removed).
coefWith <- coef(lmModel2)
coefWithout <- coef(lmModel3)

# Align the two coefficient vectors by name rather than by position, so a term
# that exists in only one model gets NA in the other instead of shifting rows.
# This needs only base R (no unexported qpcR:::cbind.na).
coefNames <- union(names(coefWith), names(coefWithout))
coefMatrix <- cbind(coefWith[coefNames], coefWithout[coefNames])
rownames(coefMatrix) <- coefNames

coefFrame <- data.frame(coefMatrix)
colnames(coefFrame) <- c("With_Outliers", "Without_Outliers")

# Relative change of each coefficient, measured against the fit without the
# outliers. The value is a fraction (1.0 == 100%). It is NA when either
# coefficient is exactly zero or was not estimated (NA).
coefFrame$ChangePercent <- ifelse(
  coefFrame$Without_Outliers == 0 | coefFrame$With_Outliers == 0,
  NA,
  abs(
    (coefFrame$Without_Outliers - coefFrame$With_Outliers) /
      coefFrame$Without_Outliers
  )
)
coefFrame
##                      With_Outliers Without_Outliers ChangePercent
## (Intercept)           5.802523e+00     5.457334e+00   0.063252353
## X.Intercept.                    NA               NA            NA
## MSZoningFV            3.649421e-01     3.684343e-01   0.009478521
## MSZoningRH            3.500045e-01     3.520675e-01   0.005859735
## MSZoningRL            3.735709e-01     3.756993e-01   0.005665049
## MSZoningRM            3.182831e-01     3.188667e-01   0.001830113
## LotArea              -3.304269e-05    -3.054191e-05   0.081880043
## NeighborhoodBlueste  -5.449867e-02    -4.547136e-02   0.198527345
## NeighborhoodBrDale   -1.116499e-01    -1.033227e-01   0.080594059
## NeighborhoodBrkSide   5.553098e-02     6.656711e-02   0.165789533
## NeighborhoodClearCr   5.659732e-02     5.259807e-02   0.076034234
## NeighborhoodCollgCr   2.062999e-02     1.786393e-02   0.154840555
## NeighborhoodCrawfor   1.478859e-01     1.522841e-01   0.028881523
## NeighborhoodEdwards  -3.385192e-02    -2.036974e-02   0.661872399
## NeighborhoodGilbert   4.142402e-02     3.909474e-02   0.059580394
## NeighborhoodIDOTRR    2.986572e-02     4.215798e-02   0.291576069
## NeighborhoodMeadowV  -1.555254e-01    -1.490745e-01   0.043272583
## NeighborhoodMitchel  -2.167774e-02    -2.115731e-02   0.024598199
## NeighborhoodNAmes     7.881489e-03     1.246246e-02   0.367581717
## NeighborhoodNoRidge   5.258912e-02     3.948794e-02   0.331776816
## NeighborhoodNPkVill  -3.213402e-02    -2.662211e-02   0.207042656
## NeighborhoodNridgHt   8.037693e-02     6.310861e-02   0.273628689
## NeighborhoodNWAmes   -1.199636e-02    -1.230746e-02   0.025277117
## NeighborhoodOldTown  -2.501480e-04     9.785906e-03   1.025562066
## NeighborhoodSawyer    5.545489e-03     1.053541e-02   0.473633302
## NeighborhoodSawyerW   5.175668e-03     3.113401e-03   0.662384114
## NeighborhoodSomerst   5.698168e-02     5.108897e-02   0.115342269
## NeighborhoodStoneBr   1.233484e-01     1.171142e-01   0.053231133
## NeighborhoodSWISU     2.516076e-02     3.181628e-02   0.209185964
## NeighborhoodTimber    3.072531e-02     2.544860e-02   0.207347768
## NeighborhoodVeenker   8.349868e-02     8.338408e-02   0.001374275
## Condition1Feedr       1.771357e-02     1.736042e-02   0.020342532
## Condition1Norm        6.647108e-02     6.637618e-02   0.001429806
## Condition1PosA        5.474452e-02     4.984826e-02   0.098223228
## Condition1PosN        7.462238e-02     6.782891e-02   0.100155990
## Condition1RRAe       -4.987018e-02    -5.299316e-02   0.058931784
## Condition1RRAn        3.937796e-02     3.645922e-02   0.080054857
## Condition1RRNe        4.677366e-02     4.231327e-02   0.105413400
## Condition1RRNn        4.750322e-02     4.600635e-02   0.032536218
## Condition2Feedr       1.126415e-01     1.183981e-01   0.048620576
## Condition2Norm        7.516557e-02     7.625750e-02   0.014319068
## Condition2PosA        2.306267e-01     2.122835e-01   0.086408787
## Condition2PosN       -8.847304e-01               NA            NA
## Condition2RRAe       -7.721693e-02    -9.478760e-02   0.185368866
## Condition2RRAn       -4.595932e-02    -5.036556e-02   0.087485102
## Condition2RRNn        5.793708e-02     6.628642e-02   0.125958579
## OverallQual           5.442154e-02     5.336399e-02   0.019817590
## OverallCond           3.704648e-02     3.722643e-02   0.004834073
## YearBuilt             7.966508e-04     9.061082e-04   0.120799553
## RoofMatlCompShg       2.791111e+00     2.897357e+00   0.036669871
## RoofMatlMembran       3.056176e+00     3.182010e+00   0.039545363
## RoofMatlMetal         2.874703e+00     2.990259e+00   0.038644207
## RoofMatlRoll          2.759706e+00     2.859959e+00   0.035054187
## RoofMatlTar.Grv       2.812830e+00     2.913899e+00   0.034685092
## RoofMatlWdShake       2.773803e+00     2.878671e+00   0.036429229
## RoofMatlWdShngl       2.858828e+00     2.956575e+00   0.033060867
## BsmtFinSF1           -3.063859e-05    -1.201921e-05   1.549135009
## BsmtFinSF2           -6.365829e-05    -4.878289e-05   0.304930628
## BsmtUnfSF             5.633787e-05     6.306460e-05   0.106664082
## HeatingQCFa          -3.736130e-02    -3.575467e-02   0.044934952
## HeatingQCGd          -2.208017e-02    -2.141009e-02   0.031297073
## HeatingQCPo          -2.249046e-03     5.037986e-03   1.446417699
## HeatingQCTA          -3.527001e-02    -3.478845e-02   0.013842430
## CentralAirY           7.002059e-02     7.081335e-02   0.011195029
## X1stFlrSF             6.519201e-05     9.011868e-05   0.276598282
## X2ndFlrSF             5.559914e-05     8.106059e-05   0.314103861
## BsmtFullBath          1.306430e-02     1.455307e-02   0.102299613
## KitchenQualFa        -9.053365e-02    -9.100839e-02   0.005216479
## KitchenQualGd        -6.537625e-02    -6.593252e-02   0.008436949
## KitchenQualTA        -7.231225e-02    -7.313849e-02   0.011296897
## FunctionalMaj2       -1.628810e-01    -1.497195e-01   0.087907740
## FunctionalMin1        6.772371e-02     7.607367e-02   0.109761565
## FunctionalMin2        7.183728e-02     7.843453e-02   0.084111545
## FunctionalMod        -3.628051e-02    -3.639894e-02   0.003253706
## FunctionalSev        -3.334320e-01    -3.399889e-01   0.019285517
## FunctionalTyp         1.066228e-01     1.154708e-01   0.076625815
## Fireplaces            2.655747e-02     2.398441e-02   0.107280663
## WoodDeckSF            9.812040e-06    -3.575290e-05   1.274440431
## ScreenPorch           2.124378e-04     2.072564e-04   0.024999973
## SaleConditionAdjLand  1.104008e-01     1.059069e-01   0.042431716
## SaleConditionAlloca   4.382500e-02     4.225328e-02   0.037197531
## SaleConditionFamily   6.564367e-03     5.920782e-03   0.108699171
## SaleConditionNormal   7.118894e-02     7.241642e-02   0.016950244
## SaleConditionPartial  1.264246e-01     1.281413e-01   0.013397075
## BasementSF                      NA               NA            NA
## OneandTwoFloorSF                NA               NA            NA
## FrontSF              -1.117981e-04    -5.703506e-05   0.960165124
## BasementBath          9.120642e-03     5.251275e-03   0.736843416
## Bath                  1.388665e-02     1.093968e-02   0.269383349
## Age                  -1.313474e-03    -1.387582e-03   0.053408510
## YrOfRemodel          -6.994120e-04    -6.950417e-04   0.006287863
## HighQualSF            1.629287e-04     1.543048e-04   0.055888385
## TotalArea             3.498281e-05     3.262412e-05   0.072298894

Predict the test dataset with the three linear regression models (lmModel1, lmModel2 and lmModel3). 1. The score for lmModel1 is 0.13617, which ranks 1181 (all variables considered). 2. The score for lmModel2 is 0.13101, which ranks 1044 (only variables with high importance and the newly engineered features). 3. The score for lmModel3 is 0.12559, which ranks 846 (only variables with high importance and the newly engineered features, outliers excluded).

# Score the test design matrices with each fitted model. exp() is applied
# further down to turn these predictions into SalePrice values.
prediction1 <- predict(lmModel1, test_feed1)
## Warning in predict.lm(lmModel1, test_feed1): prediction from a rank-
## deficient fit may be misleading
prediction2 <- predict(lmModel2, test_feed2)
## Warning in predict.lm(lmModel2, test_feed2): prediction from a rank-
## deficient fit may be misleading
prediction3 <- predict(lmModel3, test_feed2)
## Warning in predict.lm(lmModel3, test_feed2): prediction from a rank-
## deficient fit may be misleading

# Pair the ids with the exponentiated predictions in a two-column data frame
# named Id / SalePrice.
buildSubmission <- function(logPrediction) {
  submission <- data.frame(cbind(id, exp(logPrediction)))
  colnames(submission) <- c("Id", "SalePrice")
  submission
}

df1 <- buildSubmission(prediction1)
df2 <- buildSubmission(prediction2)
df3 <- buildSubmission(prediction3)

# One CSV per model, written without the row-name column.
write.csv(df1, file = "result1.csv", row.names = FALSE)
write.csv(df2, file = "result2.csv", row.names = FALSE)
write.csv(df3, file = "result3.csv", row.names = FALSE)

From this, it can be concluded that: 1. With fewer variables in the model, the performance improved. 2. With the outliers removed, the performance improved.